suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

# Raw-data volume (not referenced again in the visible part of this script).
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'

# Analysis root. It becomes the working directory and is the prefix that
# `paste_wd()` prepends without a separator, so it must keep its trailing
# slash.
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
setwd(wd)

# Output folders for the figures and tables written further down.
figdir <- str_c(wd, 'Figures/DRS_m3C_sites/RNAfold/')
tabledir <- str_c(wd, 'Tables/DRS_m3C_sites/RNAfold/')

# Default ggplot2 appearance for every plot created in this script.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))

Functions

# Prefix `path` with the global analysis root `wd`.
#
# path: character vector of paths relative to `wd`.
# Returns `wd` and `path` concatenated as-is; no separator is inserted.
paste_wd <- function(path) {
  paste0(wd, path)
}

# Expand each transcript sequence into one row per base.
#
# df: data frame with `transcript_id` and `transcript_seq` columns.
# Returns `df` with `transcript_seq` replaced by a one-character `base`
# column and a new `position` column: the 1-based running index of the row
# within its `transcript_id` group. Any grouping on the input is dropped.
calc_base_position <- function(df) {
  per_base <- unnest(
    mutate(df, transcript_seq = str_split(transcript_seq, '')),
    transcript_seq
  )

  per_base |>
    dplyr::rename(base = transcript_seq) |>
    group_by(transcript_id) |>
    # Stored as double so the column type stays numeric rather than integer
    mutate(position = as.double(row_number())) |>
    ungroup()
}

# Attach the predicted structure strings and extract the symbol found at
# each row's position.
#
# df:         data frame with `transcript_id` and a numeric `position`
#             column.
# structures: per-transcript table with `transcript_id`, `predicted_MFE` and
#             `pair_prob` string columns. Defaults to the global
#             `rnafold_predicted_MFE`, so existing one-argument calls behave
#             as before.
# Returns `df` plus `predicted_MFE`, `pair_prob`, and the single characters
# `pos_dotbracket` / `pos_pairprob` taken from those strings at `position`.
add_structureinfo <- function(df, structures = rnafold_predicted_MFE) {

  df |>
    # Explicit key: avoids the implicit natural join and its console message
    left_join(structures, by = 'transcript_id') |>
    mutate(
      pos_dotbracket = str_sub(predicted_MFE, position, position),
      pos_pairprob   = str_sub(pair_prob, position, position)
    )

}

# Keep the transcript identifiers plus every column whose name starts with
# 'pos' (e.g. position, pos_dotbracket, pos_pairprob).
select_cols <- function(df) {
  select(
    df,
    transcript_id, gene_name, genetype2,
    starts_with('pos')
  )
}

# Same selection as `select_cols()` with the transcript region
# (`kmer_region`) retained as well.
select_cols_mRNA <- function(df) {
  select(
    df,
    transcript_id, gene_name, genetype2, kmer_region,
    starts_with('pos')
  )
}

# Region-aware variant of the structure lookup: renames `kmer_middle` to
# `position`, adds the structure symbols via `add_structureinfo()`, and trims
# the result with `select_cols_mRNA()`.
add_structureinfo_mRNAs <- function(df) {
  with_position <- dplyr::rename(df, position = kmer_middle)
  select_cols_mRNA(add_structureinfo(with_position))
}

# Flag each row by whether its transcript appears in the m3C position table.
#
# df:            data frame with a `transcript_id` column.
# m3C_positions: table whose `transcript_id` values define the flagged set.
#                Defaults to the global `methylated_RNAs_C_positions`, so
#                existing one-argument calls behave as before.
# Returns `df` with an added `m3CRNA` column: 'm3C_RNA' for transcripts
# found in `m3C_positions`, 'others' for the rest.
add_m3CRNA_info <- function(df, m3C_positions = methylated_RNAs_C_positions) {

  m3C_transcripts <- m3C_positions |>
    distinct(transcript_id) |>
    mutate(m3CRNA = 'm3C_RNA')

  df |>
    # Explicit key: avoids the implicit natural join and its console message
    left_join(m3C_transcripts, by = 'transcript_id') |>
    replace_na(list(m3CRNA = 'others'))

}

# Compute a length-normalised folding energy for every transcript.
#
# summary_path: headerless two-column TSV (file label, MFE text) from the
#               RNALfold run.
# seqs_path:    TSV with `transcript_id`, `transcript_seq` and
#               `transcript_length`; the sequence column is dropped.
# Both arguments default to the locations this analysis has always used, so
# calling the function with no arguments is unchanged.
# Returns one row per transcript present in both files with `MFE` parsed to
# a number and `structuredness = -MFE / transcript_length`.
calc_structuredness_Espresso_AsPC1_RNAs <- function(
    summary_path = '/Volumes/Mitsu_NGS_2/METTL2A/RNALfold/summary.tsv',
    seqs_path = paste_wd(
      'Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz'
    )
) {

  read_tsv(summary_path, col_names = c('transcript_id', 'MFE')) |>
    mutate(
      # Strip the directory prefix and the literal '.lfold' suffix; the dot
      # is escaped so it cannot match an arbitrary character.
      transcript_id = str_remove_all(transcript_id, 'RNALfold/|\\.lfold'),
      MFE = parse_number(MFE)
    ) |>
    inner_join(
      read_tsv(seqs_path) |>
        select(-transcript_seq),
      by = 'transcript_id'
    ) |>
    mutate(structuredness = -MFE / transcript_length)

}

# Derive `genetype2` from `gene_type`, splitting protein-coding genes by
# chromosome: 'mt-mRNA' on chrM, 'mRNA' elsewhere. Every other row (including
# rows where either test is NA) keeps its `gene_type` value.
add_genetype2 <- function(df) {
  mutate(
    df,
    genetype2 = case_when(
      gene_type == 'protein_coding' & seqname != 'chrM' ~ 'mRNA',
      gene_type == 'protein_coding' & seqname == 'chrM' ~ 'mt-mRNA',
      .default = gene_type
    )
  )
}

# Turn `genetype2` into a factor ordered mRNA, mt-mRNA, Mt_rRNA.
# Values outside these three levels become NA.
reorder_genetype2 <- function(df) {
  mutate(
    df,
    genetype2 = factor(
      genetype2,
      levels = c('mRNA', 'mt-mRNA', 'Mt_rRNA')
    )
  )
}


# Turn `kmer_region` into a factor ordered 3'UTR, CDS, 5'UTR.
# Values outside these three levels become NA.
# NOTE(review): the reversed order presumably puts the 5'UTR on top once the
# plot is flipped with coord_flip() - confirm against the figure.
reorder_kmer_region <- function(df) {
  mutate(
    df,
    kmer_region = factor(
      kmer_region,
      levels = c('threeprimeUTR', 'CDS', 'fiveprimeUTR')
    )
  )
}

Read data

# RNAfold summary for the m3C-containing RNAs. The file has no header, so the
# three columns are named here: transcript id, predicted structure string,
# and pair-probability annotation string.
rnafold_predicted_MFE <-
  read_tsv(
    file = '/Volumes/Mitsu_NGS_2/METTL2A/RNAfold/m3Crnas/summary.tsv',
    col_names = c('transcript_id', 'predicted_MFE', 'pair_prob')
  )
## Rows: 71 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): transcript_id, predicted_MFE, pair_prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rnafold_predicted_MFE
## # A tibble: 71 × 3
##    transcript_id      predicted_MFE                                    pair_prob
##    <chr>              <chr>                                            <chr>    
##  1 ENST00000009589.8  .((((((((((..(((((((.((.((((.(((.....((((((...(… .(((((((…
##  2 ENST00000199764.7  ..((((((((((((.(((.((((((.....((((.(((.((((((.(… .{((((((…
##  3 ENST00000202773.14 (((((((((((((....))))))(((((...................… ,{({{{,(…
##  4 ENST00000215754.8  ....(((((.((....)).)))))...((.((((((((((.(.((((… .,,.((((…
##  5 ENST00000229239.10 (((((((((((((((((((.(((..((.(((.((((((((((.(.((… ((((((((…
##  6 ENST00000230050.4  ....((((.((((((((((...(((.((.(.(((((.((..((((((… ,,{,{(((…
##  7 ENST00000233143.6  .....((((....))))((((((((.((((.((........))))))… ...,,{((…
##  8 ENST00000234875.9  .........((((.((..((((((((((..........))).)))))… .{{{{{{,…
##  9 ENST00000243997.8  ...((((((((((((.((((.((....((((.......))))..)).… ...(((((…
## 10 ENST00000254810.8  ((((((.((((((((((.(.((((...((((.((.....((((((((… ,,{{{{.(…
## # ℹ 61 more rows
# Detected m3C sites; the k-mer centre is used as the site position and the
# structure symbol at that position is attached.
m3C_sites <-
  'Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-22.tsv' |>
  paste_wd() |>
  read_tsv() |>
  dplyr::rename(position = kmer_middle) |>
  add_structureinfo()
## Rows: 489 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2
## dbl (7): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kmer...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
m3C_sites
## # A tibble: 489 × 17
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCG            58       62
##  5 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCT            76       80
##  6 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ATCAA            94       98
##  7 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      GCCAC           149      153
##  8 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCC           154      158
##  9 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCC           155      159
## 10 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCA           156      160
## # ℹ 479 more rows
## # ℹ 10 more variables: position <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## #   predicted_MFE <chr>, pair_prob <chr>, pos_dotbracket <chr>,
## #   pos_pairprob <chr>
# Every cytidine of every transcript that carries at least one m3C site,
# with the structure symbol at that cytidine attached.
methylated_RNAs_C_positions <-
  'Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz' |>
  paste_wd() |>
  read_tsv() |>
  # Right join keeps exactly the transcripts listed in m3C_sites; the key is
  # given explicitly instead of relying on an implicit natural join.
  right_join(
    m3C_sites |> distinct(transcript_id, gene_name, genetype2),
    by = 'transcript_id'
  ) |>
  calc_base_position() |>
  filter(base == 'C') |>
  add_structureinfo()
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
methylated_RNAs_C_positions
## # A tibble: 24,117 × 10
##    transcript_id     base  transcript_length gene_name genetype2 position
##    <chr>             <chr>             <dbl> <chr>     <chr>        <dbl>
##  1 ENST00000429711.7 C                  2094 RPL32     mRNA             3
##  2 ENST00000429711.7 C                  2094 RPL32     mRNA             4
##  3 ENST00000429711.7 C                  2094 RPL32     mRNA             5
##  4 ENST00000429711.7 C                  2094 RPL32     mRNA             9
##  5 ENST00000429711.7 C                  2094 RPL32     mRNA            11
##  6 ENST00000429711.7 C                  2094 RPL32     mRNA            13
##  7 ENST00000429711.7 C                  2094 RPL32     mRNA            14
##  8 ENST00000429711.7 C                  2094 RPL32     mRNA            16
##  9 ENST00000429711.7 C                  2094 RPL32     mRNA            17
## 10 ENST00000429711.7 C                  2094 RPL32     mRNA            20
## # ℹ 24,107 more rows
## # ℹ 4 more variables: predicted_MFE <chr>, pair_prob <chr>,
## #   pos_dotbracket <chr>, pos_pairprob <chr>
# All cytidines of the methylated RNAs, annotated with the transcript region
# (`kmer_region`) they fall in.
allC_methylatedRNAs_regioninfo <-
  'Tables/DRS_m3C_sites/Metagene_CDS/allC_methylatedRNAs_regioninfo_2024-06-05.tsv' |>
  paste_wd() |>
  read_tsv()
## Rows: 22334 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, base, kmer_region, gene_name, gene_type, genetype2
## dbl (6): kmer_middle, start, end, thickStart, thickEnd, length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
allC_methylatedRNAs_regioninfo
## # A tibble: 22,334 × 12
##    transcript_id     base  kmer_middle start   end thickStart thickEnd length
##    <chr>             <chr>       <dbl> <dbl> <dbl>      <dbl>    <dbl>  <dbl>
##  1 ENST00000429711.7 C               3     0  2094         77      482   2094
##  2 ENST00000429711.7 C               4     0  2094         77      482   2094
##  3 ENST00000429711.7 C               5     0  2094         77      482   2094
##  4 ENST00000429711.7 C               9     0  2094         77      482   2094
##  5 ENST00000429711.7 C              11     0  2094         77      482   2094
##  6 ENST00000429711.7 C              13     0  2094         77      482   2094
##  7 ENST00000429711.7 C              14     0  2094         77      482   2094
##  8 ENST00000429711.7 C              16     0  2094         77      482   2094
##  9 ENST00000429711.7 C              17     0  2094         77      482   2094
## 10 ENST00000429711.7 C              20     0  2094         77      482   2094
## # ℹ 22,324 more rows
## # ℹ 4 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## #   genetype2 <chr>
# m3C sites annotated with the transcript region (`kmer_region`) they fall
# in.
DRS_methylated_positions_CDSpos <-
  'Tables/DRS_m3C_sites/Metagene_CDS/DRS_methylated_positions_CDSpos_2024-06-05.tsv' |>
  paste_wd() |>
  read_tsv()
## Rows: 436 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (7): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2,...
## dbl (11): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kme...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
DRS_methylated_positions_CDSpos
## # A tibble: 436 × 18
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCA            33       37
##  5 ENST00000361390.2 MT-ND1    chrM    protein_cod… CCCCT           123      127
##  6 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCT           141      145
##  7 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCG           186      190
##  8 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCT           205      209
##  9 ENST00000361390.2 MT-ND1    chrM    protein_cod… CCCCC           260      264
## 10 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCTC           322      326
## # ℹ 426 more rows
## # ℹ 11 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## #   start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>, kmer_region <chr>
# Transcript-level GENCODE v43 records with the derived `genetype2` class.
gencode_annotation <-
  'Tables/Database/gencode.v43.annotation.tsv' |>
  paste_wd() |>
  read_tsv() |>
  # Row filter first: the remaining steps are row-wise, so the result is the
  # same while far fewer rows are processed.
  filter(primary_tag == 'transcript') |>
  dplyr::rename(seqname = seq_id) |>
  add_genetype2()
## Rows: 3422892 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): seq_id, source_tag, primary_tag, score, frame, artif_dupl, ccdsid,...
## dbl  (4): start, end, strand, level
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Standardised Espresso AsPC1 annotation table (gene / transcript / exon
# records).
espresso_AsPC1_annotation <-
  read_tsv(
    file = '/Volumes/Mitsu_NGS_2/METTL2A/Database/Custom/Espresso_AsPC1/Espresso_AsPC1_annotation_standardized.tsv'
  )
## Rows: 285554 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (10): seq_id, source_tag, primary_tag, score, frame, exon_number, gene_i...
## dbl  (3): start, end, strand
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  #read_gtf_transcripts('/Volumes/Mitsu_NGS_2/METTL2A/Database/Custom/Espresso_AsPC1/Espresso_AsPC1_annotation_geneplus.gtf')
espresso_AsPC1_annotation 
## # A tibble: 285,554 × 13
##    seq_id source_tag      primary_tag start   end score strand frame exon_number
##    <chr>  <chr>           <chr>       <dbl> <dbl> <chr>  <dbl> <chr> <chr>      
##  1 chrM   annotated_isof… gene          577   647 .          1 .     N/A        
##  2 chrM   annotated_isof… transcript    577   647 .          1 .     N/A        
##  3 chrM   annotated_isof… exon          577   647 .          1 .     1          
##  4 chrM   annotated_isof… gene          648  1601 .          1 .     N/A        
##  5 chrM   annotated_isof… transcript    648  1601 .          1 .     N/A        
##  6 chrM   annotated_isof… exon          648  1601 .          1 .     1          
##  7 chrM   annotated_isof… gene         1671  3229 .          1 .     N/A        
##  8 chrM   annotated_isof… transcript   1671  3229 .          1 .     N/A        
##  9 chrM   annotated_isof… exon         1671  3229 .          1 .     1          
## 10 chrM   annotated_isof… gene         3307  4262 .          1 .     N/A        
## # ℹ 285,544 more rows
## # ℹ 4 more variables: gene_id <chr>, ID <chr>, Parent <chr>,
## #   transcript_id <chr>
# Per-transcript structuredness, flagged as m3C RNA or not, combined with the
# GENCODE transcript annotation.
# NOTE(review): the full join also keeps GENCODE transcripts that have no
# structuredness value (36,717 input rows grow to 252,870), which is what
# later triggers the "non-finite values" warning in the ECDF plot. A
# left_join() would avoid that if those rows are not needed - confirm.
structuredness_Espresso_AsPC1_RNAs <-
  calc_structuredness_Espresso_AsPC1_RNAs() |>
  add_m3CRNA_info() |>
  # Key given explicitly instead of relying on an implicit natural join
  full_join(gencode_annotation, by = 'transcript_id')
## Rows: 36717 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, MFE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
structuredness_Espresso_AsPC1_RNAs
## # A tibble: 252,870 × 33
##    transcript_id         MFE transcript_length structuredness m3CRNA seqname
##    <chr>               <dbl>             <dbl>          <dbl> <chr>  <chr>  
##  1 ENST00000000233.10  -387.              1032          0.375 others chr7   
##  2 ENST00000000412.8   -674.              2450          0.275 others chr12  
##  3 ENST00000000442.11 -1014.              2274          0.446 others chr11  
##  4 ENST00000001008.6  -1110               3715          0.299 others chr12  
##  5 ENST00000002125.9   -565.              2184          0.259 others chr2   
##  6 ENST00000002165.11  -644.              2385          0.270 others chr6   
##  7 ENST00000002501.11  -784.              2056          0.381 others chr16  
##  8 ENST00000002596.6  -1885.              7160          0.263 others chr4   
##  9 ENST00000003100.13  -803.              3155          0.254 others chr7   
## 10 ENST00000003583.12  -750.              2544          0.295 others chr1   
## # ℹ 252,860 more rows
## # ℹ 27 more variables: source_tag <chr>, primary_tag <chr>, start <dbl>,
## #   end <dbl>, score <chr>, strand <dbl>, frame <chr>, artif_dupl <chr>,
## #   ccdsid <chr>, exon_id <chr>, exon_number <chr>, gene_id <chr>,
## #   gene_name <chr>, gene_type <chr>, havana_gene <chr>,
## #   havana_transcript <chr>, hgnc_id <chr>, ID <chr>, level <dbl>, ont <chr>,
## #   Parent <chr>, protein_id <chr>, tag <chr>, transcript_name <chr>, …
# Quick look: structuredness of protein-coding mRNAs, m3C RNAs vs. others.
structuredness_Espresso_AsPC1_RNAs |>
  filter(
    !is.na(transcript_type),
    !is.na(m3CRNA),
    genetype2 == 'mRNA',
    transcript_type == 'protein_coding'
  ) |>
  ggplot(aes(x = m3CRNA, y = structuredness)) +
  facet_wrap(vars(transcript_type)) +
  #geom_density() +
  geom_boxplot()

# Keep annotated transcripts with a known m3C flag whose transcript type is
# protein_coding or Mt_rRNA, then write the table as a gzipped TSV.
structuredness_Espresso_AsPC1_RNAs_filtered <-
  structuredness_Espresso_AsPC1_RNAs |>
  filter(
    !is.na(transcript_type),
    !is.na(m3CRNA),
    transcript_type %in% c('protein_coding', 'Mt_rRNA')
  )
export_tsv(
  structuredness_Espresso_AsPC1_RNAs_filtered,
  outdir = tabledir, compression = 'gz'
)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/RNAfold/structuredness_Espresso_AsPC1_RNAs_filtered_2024-07-29.tsv.gz
## # A tibble: 19,786 × 33
##    transcript_id         MFE transcript_length structuredness m3CRNA seqname
##    <chr>               <dbl>             <dbl>          <dbl> <chr>  <chr>  
##  1 ENST00000000233.10  -387.              1032          0.375 others chr7   
##  2 ENST00000000412.8   -674.              2450          0.275 others chr12  
##  3 ENST00000000442.11 -1014.              2274          0.446 others chr11  
##  4 ENST00000001008.6  -1110               3715          0.299 others chr12  
##  5 ENST00000002125.9   -565.              2184          0.259 others chr2   
##  6 ENST00000002165.11  -644.              2385          0.270 others chr6   
##  7 ENST00000002501.11  -784.              2056          0.381 others chr16  
##  8 ENST00000002596.6  -1885.              7160          0.263 others chr4   
##  9 ENST00000003100.13  -803.              3155          0.254 others chr7   
## 10 ENST00000003583.12  -750.              2544          0.295 others chr1   
## # ℹ 19,776 more rows
## # ℹ 27 more variables: source_tag <chr>, primary_tag <chr>, start <dbl>,
## #   end <dbl>, score <chr>, strand <dbl>, frame <chr>, artif_dupl <chr>,
## #   ccdsid <chr>, exon_id <chr>, exon_number <chr>, gene_id <chr>,
## #   gene_name <chr>, gene_type <chr>, havana_gene <chr>,
## #   havana_transcript <chr>, hgnc_id <chr>, ID <chr>, level <dbl>, ont <chr>,
## #   Parent <chr>, protein_id <chr>, tag <chr>, transcript_name <chr>, …
# Wilcoxon test of structuredness, m3C RNAs against 'others', run separately
# for each genetype2 class. Mt_rRNA rows are excluded from the test.
structuredness_Espresso_AsPC1_RNAs_filtered_wilcox <-
  structuredness_Espresso_AsPC1_RNAs_filtered |>
  filter(genetype2 != 'Mt_rRNA') |>
  group_by(genetype2) |>
  # Argument spelled out in full; `ref =` only worked through partial
  # argument matching.
  rstatix::wilcox_test(structuredness ~ m3CRNA, ref.group = 'others') |>
  # Adds the y coordinates used to place the p-value labels on the plot
  rstatix::add_y_position()

# Sina + box plot of structuredness (m3C RNAs vs. others), one panel per
# genetype2, with the Wilcoxon p-values overlaid; saved to `figdir`.
structuredness_Espresso_AsPC1_RNAs_m3C_sinaplot <-
  ggplot(
    structuredness_Espresso_AsPC1_RNAs_filtered,
    aes(x = m3CRNA, y = structuredness, colour = m3CRNA)
  ) +
  # Layer order matters: points first, box drawn on top, then the p-values
  ggforce::geom_sina(size = 1) +
  geom_boxplot(width = .5) +
  ggpubr::stat_pvalue_manual(
    data = structuredness_Espresso_AsPC1_RNAs_filtered_wilcox,
    tip.length = 0, coord.flip = TRUE
  ) +
  scale_color_manual(values = c('red', 'gray30')) +
  facet_wrap(vars(genetype2), ncol = 1) +
  coord_flip()
ggsave_multiple_formats(
  structuredness_Espresso_AsPC1_RNAs_m3C_sinaplot,
  width = 4.5, height = 6, fontsize = 7, outdir = figdir
)

# Cumulative distribution of structuredness for mRNAs, m3C RNAs vs. others,
# one panel per transcript type.
ggplot(
  data = filter(structuredness_Espresso_AsPC1_RNAs, genetype2 == 'mRNA'),
  mapping = aes(x = structuredness, colour = m3CRNA)
) +
  facet_wrap(vars(transcript_type)) +
  #geom_boxplot() +
  #geom_density() +
  stat_ecdf()
## Warning: Removed 141469 rows containing non-finite values (`stat_ecdf()`).

Bind data of all C and m3C sites

Structure information grouped by transcript type

# Stack the m3C sites (type = 'm3C') on top of all cytidines of the same
# RNAs (type = 'allC'), keeping only the id and position columns.
m3C_allC_structureinfo <-
  bind_rows(
    m3C_sites |>
      select_cols() |>
      mutate(type = 'm3C'),
    methylated_RNAs_C_positions |>
      select_cols() |>
      mutate(type = 'allC')
  ) |>
  reorder_genetype2()
m3C_allC_structureinfo
## # A tibble: 24,606 × 7
##    transcript_id  gene_name genetype2 position pos_dotbracket pos_pairprob type 
##    <chr>          <chr>     <fct>        <dbl> <chr>          <chr>        <chr>
##  1 ENST000004297… RPL32     mRNA           425 (              (            m3C  
##  2 ENST000006472… RPL35A    mRNA           383 (              |            m3C  
##  3 ENST000006472… RPL35A    mRNA           384 (              |            m3C  
##  4 ENST000003896… MT-RNR1   Mt_rRNA         60 (              (            m3C  
##  5 ENST000003896… MT-RNR1   Mt_rRNA         78 (              (            m3C  
##  6 ENST000003896… MT-RNR1   Mt_rRNA         96 .              .            m3C  
##  7 ENST000003896… MT-RNR1   Mt_rRNA        151 .              .            m3C  
##  8 ENST000003896… MT-RNR1   Mt_rRNA        156 (              {            m3C  
##  9 ENST000003896… MT-RNR1   Mt_rRNA        157 (              {            m3C  
## 10 ENST000003896… MT-RNR1   Mt_rRNA        158 .              .            m3C  
## # ℹ 24,596 more rows

Structure information grouped by transcript type and region

# Region-aware version: m3C sites (type = 'm3C') stacked on all cytidines
# (type = 'allC'), each with its structure symbols and transcript region.
m3C_allC_structureinfo_mRNAs <-
  bind_rows(
    DRS_methylated_positions_CDSpos |>
      add_structureinfo_mRNAs() |>
      mutate(type = 'm3C'),
    allC_methylatedRNAs_regioninfo |>
      add_structureinfo_mRNAs() |>
      mutate(type = 'allC')
  ) |>
  reorder_genetype2()
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
m3C_allC_structureinfo_mRNAs
## # A tibble: 22,770 × 8
##    transcript_id     gene_name genetype2 kmer_region position pos_dotbracket
##    <chr>             <chr>     <fct>     <chr>          <dbl> <chr>         
##  1 ENST00000429711.7 RPL32     mRNA      CDS              425 (             
##  2 ENST00000647248.2 RPL35A    mRNA      CDS              383 (             
##  3 ENST00000647248.2 RPL35A    mRNA      CDS              384 (             
##  4 ENST00000361390.2 MT-ND1    mt-mRNA   CDS               35 .             
##  5 ENST00000361390.2 MT-ND1    mt-mRNA   CDS              125 (             
##  6 ENST00000361390.2 MT-ND1    mt-mRNA   CDS              143 )             
##  7 ENST00000361390.2 MT-ND1    mt-mRNA   CDS              188 .             
##  8 ENST00000361390.2 MT-ND1    mt-mRNA   CDS              207 .             
##  9 ENST00000361390.2 MT-ND1    mt-mRNA   CDS              262 .             
## 10 ENST00000361390.2 MT-ND1    mt-mRNA   CDS              324 .             
## # ℹ 22,760 more rows
## # ℹ 2 more variables: pos_pairprob <chr>, type <chr>

Calculate % of paired or unpaired bases

Grouped by genetype

# Share of each pair-probability symbol among the cytidines of every
# genetype2 x type combination.
m3C_allC_pairprob_percent <-
  m3C_allC_structureinfo |>
  # One row per symbol / genetype2 / type with its count `n`
  count(pos_pairprob, genetype2, type) |>
  # Per-operation grouping, so the result is a plain ungrouped tibble
  mutate(percent = 100 * n / sum(n), .by = c(genetype2, type)) |>
  arrange(genetype2)
m3C_allC_pairprob_percent
## # A tibble: 50 × 5
## # Groups:   genetype2, type [8]
##    pos_pairprob genetype2 type      n percent
##    <chr>        <fct>     <chr> <int>   <dbl>
##  1 (            mRNA      allC   5507   26.8 
##  2 (            mRNA      m3C      58   22.6 
##  3 )            mRNA      allC   5409   26.3 
##  4 )            mRNA      m3C      67   26.1 
##  5 ,            mRNA      allC   1418    6.90
##  6 ,            mRNA      m3C      21    8.17
##  7 .            mRNA      allC   5199   25.3 
##  8 .            mRNA      m3C      73   28.4 
##  9 {            mRNA      allC   1046    5.09
## 10 {            mRNA      m3C      12    4.67
## # ℹ 40 more rows

Grouped by genetype and region

# Share of each pair-probability symbol among the cytidines of every
# genetype2 x region x type combination.
m3C_allC_pairprob_percent_mRNAs <-
  m3C_allC_structureinfo_mRNAs |>
  # One row per symbol / genetype2 / region / type with its count `n`
  count(pos_pairprob, genetype2, kmer_region, type) |>
  # Per-operation grouping, so the result is a plain ungrouped tibble
  mutate(
    percent = 100 * n / sum(n),
    .by = c(genetype2, kmer_region, type)
  ) |>
  arrange(genetype2)
m3C_allC_pairprob_percent_mRNAs
## # A tibble: 55 × 6
## # Groups:   genetype2, kmer_region, type [8]
##    pos_pairprob genetype2 kmer_region   type      n percent
##    <chr>        <fct>     <chr>         <chr> <int>   <dbl>
##  1 (            mRNA      CDS           allC   2068   27.6 
##  2 (            mRNA      CDS           m3C      43   24.0 
##  3 (            mRNA      fiveprimeUTR  allC    737   39.9 
##  4 (            mRNA      fiveprimeUTR  m3C      11   36.7 
##  5 (            mRNA      threeprimeUTR allC   2431   23.9 
##  6 (            mRNA      threeprimeUTR m3C       4    8.89
##  7 )            mRNA      CDS           allC   1835   24.5 
##  8 )            mRNA      CDS           m3C      45   25.1 
##  9 )            mRNA      fiveprimeUTR  allC    198   10.7 
## 10 )            mRNA      fiveprimeUTR  m3C       5   16.7 
## # ℹ 45 more rows

Calculate % (in dotbracket)

Grouped by genetype

# Share of each dot-bracket symbol among the cytidines of every
# genetype2 x type combination; the table is also written to `tabledir`.
m3C_allC_dotbracket_percent <-
  m3C_allC_structureinfo |>
  # One row per symbol / genetype2 / type with its count `n`
  count(pos_dotbracket, genetype2, type) |>
  # Per-operation grouping, so the result is a plain ungrouped tibble
  mutate(percent = 100 * n / sum(n), .by = c(genetype2, type)) |>
  arrange(genetype2)
m3C_allC_dotbracket_percent |>
  export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/RNAfold/m3C_allC_dotbracket_percent_2024-07-29.tsv
## # A tibble: 23 × 5
## # Groups:   genetype2, type [8]
##    pos_dotbracket genetype2 type      n percent
##    <chr>          <fct>     <chr> <int>   <dbl>
##  1 (              mRNA      allC   6892   33.5 
##  2 (              mRNA      m3C      75   29.2 
##  3 )              mRNA      allC   6719   32.7 
##  4 )              mRNA      m3C      87   33.9 
##  5 .              mRNA      allC   6937   33.8 
##  6 .              mRNA      m3C      95   37.0 
##  7 (              mt-mRNA   allC    405   14.4 
##  8 (              mt-mRNA   m3C      11    6.04
##  9 )              mt-mRNA   allC    413   14.7 
## 10 )              mt-mRNA   m3C      26   14.3 
## # ℹ 13 more rows

Grouped by genetype and region

# Share of each dot-bracket symbol among the cytidines of every
# genetype2 x region x type combination; the table is also written to
# `tabledir`.
m3C_allC_dotbracket_percent_mRNAs <-
  m3C_allC_structureinfo_mRNAs |>
  # One row per symbol / genetype2 / region / type with its count `n`
  count(pos_dotbracket, genetype2, kmer_region, type) |>
  # Per-operation grouping, so the result is a plain ungrouped tibble
  mutate(
    percent = 100 * n / sum(n),
    .by = c(genetype2, kmer_region, type)
  ) |>
  arrange(genetype2)
m3C_allC_dotbracket_percent_mRNAs |>
  export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/RNAfold/m3C_allC_dotbracket_percent_mRNAs_2024-07-29.tsv
## # A tibble: 24 × 6
## # Groups:   genetype2, kmer_region, type [8]
##    pos_dotbracket genetype2 kmer_region   type      n percent
##    <chr>          <fct>     <chr>         <chr> <int>   <dbl>
##  1 (              mRNA      CDS           allC   2602    34.7
##  2 (              mRNA      CDS           m3C      53    29.6
##  3 (              mRNA      fiveprimeUTR  allC    900    48.8
##  4 (              mRNA      fiveprimeUTR  m3C      12    40  
##  5 (              mRNA      threeprimeUTR allC   3055    30.0
##  6 (              mRNA      threeprimeUTR m3C      10    22.2
##  7 )              mRNA      CDS           allC   2350    31.4
##  8 )              mRNA      CDS           m3C      61    34.1
##  9 )              mRNA      fiveprimeUTR  allC    303    16.4
## 10 )              mRNA      fiveprimeUTR  m3C       7    23.3
## # ℹ 14 more rows

Plot

Dotbracket

# Stacked bars of paired ('()') vs. unpaired ('.') cytidines for each
# type x genetype2 combination; saved to `figdir`.
m3C_allC_dotbracket_percent_barplot <-
  m3C_allC_dotbracket_percent |>
  filter(!is.na(genetype2)) |>
  # Collapse '(' and ')' into a single "paired" class
  mutate(
    pos_dotbracket2 = if_else(pos_dotbracket == '.', pos_dotbracket, '()')
  ) |>
  ggplot(aes(
    x = interaction(type, genetype2), y = percent, fill = pos_dotbracket2
  )) +
  geom_col() +
  coord_flip() +
  scale_y_reverse()
ggsave_multiple_formats(
  m3C_allC_dotbracket_percent_barplot,
  width = 4.5, height = 6, fontsize = 7, outdir = figdir
)

# Stacked bars of paired ('()') vs. unpaired ('.') cytidines in mRNAs for
# each type x transcript-region combination; saved to `figdir`.
m3C_allC_dotbracket_percent_mRNAs_barplot <-
  m3C_allC_dotbracket_percent_mRNAs |>
  # Keeping only 'mRNA' rows also discards rows whose genetype2 is NA
  filter(genetype2 == 'mRNA') |>
  reorder_kmer_region() |>
  # Collapse '(' and ')' into a single "paired" class
  mutate(
    pos_dotbracket2 = if_else(pos_dotbracket == '.', pos_dotbracket, '()')
  ) |>
  ggplot(aes(
    x = interaction(type, kmer_region), y = percent, fill = pos_dotbracket2
  )) +
  geom_col() +
  coord_flip() +
  scale_y_reverse()
ggsave_multiple_formats(
  m3C_allC_dotbracket_percent_mRNAs_barplot,
  width = 4.5, height = 6, fontsize = 7, outdir = figdir
)

pair probability

# Stacked bars of the pair-probability classes for each type x genetype2
# combination.
# Labels follow the RNAfold partition-function notation: '(' / ')' mark
# strongly paired bases with a preferred direction, '{' / '}' weakly paired
# ones. The two labels were previously swapped.
m3C_allC_pairprob_percent |>
  mutate(pairprob = case_when(
    pos_pairprob %in% c('(', ')') ~ 'strongly paired with preference',
    pos_pairprob %in% c('{', '}') ~ 'weakly paired with preference',
    pos_pairprob == '.' ~ 'essentially unpaired',
    pos_pairprob == ',' ~ 'weakly paired without preference',
    pos_pairprob == '|' ~ 'strongly paired without preference',
    .default = NA_character_
  )) |>
  filter(!is.na(genetype2)) |>
  ggplot(aes(x = interaction(type, genetype2), y = percent, fill = pairprob)) +
  geom_col() +
  coord_flip()

# Stacked bars of the pair-probability classes for each
# type x genetype2 x transcript-region combination.
# Labels follow the RNAfold partition-function notation: '(' / ')' mark
# strongly paired bases with a preferred direction, '{' / '}' weakly paired
# ones. The two labels were previously swapped.
m3C_allC_pairprob_percent_mRNAs |>
  mutate(pairprob = case_when(
    pos_pairprob %in% c('(', ')') ~ 'strongly paired with preference',
    pos_pairprob %in% c('{', '}') ~ 'weakly paired with preference',
    pos_pairprob == '.' ~ 'essentially unpaired',
    pos_pairprob == ',' ~ 'weakly paired without preference',
    pos_pairprob == '|' ~ 'strongly paired without preference',
    .default = NA_character_
  )) |>
  filter(!is.na(genetype2)) |>
  ggplot(aes(
    x = interaction(type, genetype2, kmer_region),
    y = percent, fill = pairprob
  )) +
  geom_col() +
  coord_flip()